This notebook demonstrates how SynthesisFilters.jl works. Synthesized audio examples (Japanese speech) are provided below so that you can compare the synthesis filters directly in your browser.
In this notebook, the following synthesis filters are demonstrated: cepstrum-based, mel-cepstrum-based, mel-generalized cepstrum-based, LPC-based, PARCOR-based, and LSP-based.
In [1]:
using PyCall
matplotlib = pyimport("matplotlib")
PyDict(matplotlib["rcParams"])["figure.figsize"] = (12, 5)
using PyPlot
In [2]:
# https://gist.github.com/jfsantos/a39ed69a7894876f1e04#file-audiodisplay-jl
# Thanks, @jfsantos
include("AudioDisplay.jl")
Out[2]:
In [3]:
using WAV
using DSP
using MelGeneralizedCepstrums # to estimate spectral envelope parameters
using SynthesisFilters
In [4]:
# plotting utilities
function wavplot(x; label="a waveform", x_label="sample")
plot(1:endof(x), x, "b", label=label)
xlim(1, endof(x))
xlabel(x_label)
legend()
end
function wavcompare(x, y; label="synthesized waveform", x_label="sample")
plot(1:endof(y), y, "r-+", label=label)
plot(1:endof(x), x, label="original speech signal")
xlim(1, endof(x))
xlabel(x_label)
legend()
end
Out[4]:
In [5]:
x, fs = wavread(joinpath(dirname(@__FILE__), "data", "test16k.wav"), format="native")
x = convert(Vector{Float64}, vec(x))
fs = convert(Int, fs)
wavplot(x)
inline_audioplayer(map(Int16, x), fs)
In [6]:
# Note about the excitation
# fs: 16000
# frame period: 5.0 ms
# F0 analysis: estimated by WORLD.dio and WORLD.stonemask
# Excitation generation: periodic pulses for voiced segments and Gaussian random
# values for unvoiced segments
base_excitation = vec(readdlm(joinpath(dirname(@__FILE__), "data", "test16k_excitation.txt")))
wavplot(base_excitation)
inline_audioplayer(base_excitation ./ maximum(base_excitation), fs)
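The excitation above was precomputed offline, but the recipe described in the comment is straightforward. Below is a minimal sketch of that idea, assuming a per-frame F0 contour as input; the function name `generate_excitation` and the `sqrt(period)` pulse amplitude are illustrative assumptions, not the code that produced `test16k_excitation.txt`.

# Hypothetical sketch: build an excitation from a per-frame F0 contour (Hz).
# Voiced frames (f0 > 0) get one pulse per pitch period; unvoiced frames get
# Gaussian noise. sqrt(period) is one common choice of pulse amplitude.
function generate_excitation(f0, fs, hopsize)
    excitation = zeros(length(f0) * hopsize)
    phase = 0.0
    for t in 1:length(f0)
        offset = (t - 1) * hopsize
        if f0[t] > 0.0
            period = fs / f0[t]
            for n in 1:hopsize
                phase += 1.0
                if phase >= period
                    excitation[offset + n] = sqrt(period)
                    phase -= period
                end
            end
        else
            excitation[offset+1:offset+hopsize] = randn(hopsize)
            phase = 0.0
        end
    end
    excitation
end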
In [7]:
framelen = 512
hopsize = 80 # 5.0 ms for fs 16000
noverlap = framelen - hopsize
# Note that mgcep analysis basically assumes power-normalized window so that Σₙ w(n)² = 1
win = DSP.blackman(framelen) ./ sqrt(sumabs2(DSP.blackman(framelen)))
@assert isapprox(sumabs2(win), 1.0)
# create a windowed signal matrix in which each column is a windowed time slice
as = arraysplit(x, framelen, noverlap)
xw = Array(Float64, framelen, length(as))
for t=1:length(as)
xw[:,t] = as[t]
end
# col-wise windowing
xw .*= win;
@show size(xw)
Out[7]:
In [8]:
c = estimate(MelCepstrum(20, mcepalpha(fs)), xw)
imshow(c, origin="lower", aspect="auto")
colorbar()
Out[8]:
In [9]:
# Let's look at the spectral envelope estimate
imshow(real(mgc2sp(c, framelen)), origin="lower", aspect="auto")
colorbar()
Out[9]:
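A brief note on what is plotted above: `mgc2sp` maps the (mel-generalized) cepstrum to a spectrum, and the real part taken here presumably corresponds to the log amplitude envelope. For a plain linear-frequency cepstrum that envelope is just a cosine series of the coefficients; the mel-warped case evaluates the same series on a warped frequency axis. The sketch below covers the unwarped case only, for a single frame's cepstral vector, and `cepstrum_to_logspectrum` is a made-up name for illustration.

# Hypothetical sketch for the unwarped (linear-frequency) case:
# log|H(ω)| = c[1] + 2 Σₘ c[m+1] cos(m·ω), evaluated on an FFT grid.
function cepstrum_to_logspectrum(c, fftlen)
    halflen = div(fftlen, 2) + 1
    logspec = zeros(halflen)
    for k in 1:halflen
        ω = 2π * (k - 1) / fftlen
        s = c[1]
        for m in 1:length(c)-1
            s += 2 * c[m+1] * cos(m * ω)
        end
        logspec[k] = s
    end
    logspec
end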
In [10]:
c = estimate(LinearCepstrum(25), xw)
y = synthesis(base_excitation, c, hopsize)
wavcompare(x, y, label="Cepstrum-based synthesized waveform")
inline_audioplayer(round(Int16, clamp(y, typemin(Int16), typemax(Int16))), fs)
In [11]:
c = estimate(MelCepstrum(25, mcepalpha(fs)), xw)
y = synthesis(base_excitation, c, hopsize)
wavcompare(x, y, label="Mel-cepstrum-based synthesized waveform")
inline_audioplayer(round(Int16, clamp(y, typemin(Int16), typemax(Int16))), fs)
In [12]:
c = estimate(MelGeneralizedCepstrum(25, mcepalpha(fs), -1/4), xw)
y = synthesis(base_excitation, c, hopsize)
wavcompare(x, y, label="Mel-generalized cepstrum based synthesized waveform")
inline_audioplayer(round(Int16, clamp(y, typemin(Int16), typemax(Int16))), fs)
In [13]:
l = estimate(LinearPredictionCoef(25), xw, use_mgcep=true)
y = synthesis(base_excitation, l, hopsize)
wavcompare(x, y, label="LPC-based synthesized waveform")
inline_audioplayer(round(Int16, clamp(y, typemin(Int16), typemax(Int16))), fs)
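For reference, this kind of all-pole synthesis has a compact direct-form description: each output sample is the gain-scaled excitation sample minus a weighted sum of past outputs, with the coefficients switched every hopsize samples. The sketch below illustrates that idea only; the coefficient layout (gain in the first row, a₁…aₚ below it, one column per frame) and the function name are assumptions, not the SynthesisFilters.jl implementation.

# Hypothetical direct-form all-pole synthesis: y[n] = g*e[n] - Σₖ aₖ*y[n-k],
# with the LPC coefficients switched every `hopsize` samples.
function allpole_synthesis(excitation, lpcmat, hopsize)
    order = size(lpcmat, 1) - 1
    nframes = size(lpcmat, 2)
    y = zeros(length(excitation))
    for n in 1:length(excitation)
        t = min(div(n - 1, hopsize) + 1, nframes)   # frame index for sample n
        acc = lpcmat[1, t] * excitation[n]          # gain-scaled excitation
        for k in 1:order
            if n - k >= 1
                acc -= lpcmat[k+1, t] * y[n-k]
            end
        end
        y[n] = acc
    end
    y
end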
In [14]:
l = lpc2par(estimate(LinearPredictionCoef(25), xw))
y = synthesis(base_excitation, l, hopsize)
wavcompare(x, y, label="PARCOR-based synthesized waveform")
inline_audioplayer(round(Int16, clamp(y, typemin(Int16), typemax(Int16))), fs)
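The PARCOR variant replaces the direct-form feedback with an all-pole lattice, which remains stable as long as every reflection coefficient has magnitude below one. A rough sketch of such a lattice follows, under the convention A(z) = 1 + Σₘ aₘ z⁻ᵐ with kₘ taken from the Levinson-Durbin recursion; sign conventions differ between toolkits, and the coefficient layout (gain in the first row, k₁…kₚ below it) is likewise an assumption.

# Hypothetical all-pole lattice synthesis driven by per-frame PARCOR coefficients.
function lattice_synthesis(excitation, parcor, hopsize)
    order = size(parcor, 1) - 1
    nframes = size(parcor, 2)
    y = zeros(length(excitation))
    b = zeros(order + 1)   # b[m+1] holds the backward error of order m at time n-1
    for n in 1:length(excitation)
        t = min(div(n - 1, hopsize) + 1, nframes)
        k = parcor[2:end, t]               # reflection coefficients (assumed layout)
        f = parcor[1, t] * excitation[n]   # start from the gain-scaled excitation
        for m in order:-1:1
            f -= k[m] * b[m]               # f_{m-1}[n] = f_m[n] - k_m*b_{m-1}[n-1]
            b[m+1] = b[m] + k[m] * f       # b_m[n] = b_{m-1}[n-1] + k_m*f_{m-1}[n]
        end
        b[1] = f                           # b_0[n] equals the output sample
        y[n] = f
    end
    y
end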
In [15]:
l = lpc2lsp(estimate(LinearPredictionCoef(15), xw))
y = synthesis(base_excitation, l, hopsize)
wavcompare(x, y, label="LSP-based synthesized waveform")
inline_audioplayer(round(Int16, clamp(y, typemin(Int16), typemax(Int16))), fs)